library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(knitr)
library(gtsummary)
library(tidyr)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(mapview)
# Load the pre-processed UFO sightings data from the working directory
ufo_sightings_transformed <- readr::read_csv(
  file = "ufo-sightings-transformed.csv"
)
## New names:
## Rows: 80328 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (9): date_documented, Season, Country_Code, Country, Region, Locale, UF... dbl
## (7): ...1, Year, Month, Hour, latitude, longitude, length_of_encounter_... dttm
## (1): Date_time
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Convert the grouping columns to factors so that summaries and plots treat
# them as categorical. Each column is listed exactly once.
ufo_sightings_transformed <- ufo_sightings_transformed %>%
  mutate(
    across(
      c(Season, Country_Code, Country, Region, UFO_shape, Year, Hour),
      factor
    )
  )
library(dplyr)
library(gtsummary)
# Create the summary table: continuous variables are reported as mean (SD),
# categorical variables as n (%).
UFO_summary_table <- ufo_sightings_transformed %>%
  select(length_of_encounter_seconds, Season, Month) %>%
  gtsummary::tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    )
  )

# Print the summary table
UFO_summary_table
| Characteristic | N = 80,328<sup>1</sup> |
|---|---|
| length_of_encounter_seconds | 9,017 (620,232) |
| Season | |
| &nbsp;&nbsp;&nbsp;&nbsp;Autumn | 21,735 (27%) |
| &nbsp;&nbsp;&nbsp;&nbsp;Spring | 16,268 (20%) |
| &nbsp;&nbsp;&nbsp;&nbsp;Summer | 26,307 (33%) |
| &nbsp;&nbsp;&nbsp;&nbsp;Winter | 16,018 (20%) |
| Month | 6.8 (3.2) |
| <sup>1</sup> Mean (SD); n (%) | |
You can also embed plots, for example:
# Bar plot of the number of sightings per season, with narrowed bars.
# library() fails loudly if ggplot2 is missing, so the report never installs
# packages as a side effect of being rendered.
library(ggplot2)
ggplot(ufo_sightings_transformed, aes(x = Season)) +
  geom_bar(fill = "skyblue", color = "darkblue", width = 0.5) +
  labs(
    title = "Bar Plot For UFO Sightings appeared in Different Seasons",
    x = "Season",
    y = "Count"
  ) +
  theme_minimal()
# Keep complete cases only: a row is dropped if ANY of its columns is NA
ufo_sightings_transformed <- ufo_sightings_transformed %>%
  na.omit()
\(H_0:\) The average length of a UFO encounter (in seconds) in the California region is equal to the average length of a UFO encounter in the England region.
\(\mu_{cal}=\mu_{eng}\)
\(H_A:\) The average length of a UFO encounter (in seconds) in the California region is greater than the average length of a UFO encounter in the England region.
\(\mu_{cal} > \mu_{eng}\)
# Sightings from California and England whose encounter length is a finite
# value between 0.1 and 15000 seconds (inclusive)
california_england_data <- ufo_sightings_transformed %>%
  filter(
    Region %in% c("California", "England"),
    !is.na(Region),
    is.finite(length_of_encounter_seconds),
    length_of_encounter_seconds >= 0.1,
    length_of_encounter_seconds <= 15000
  )
# Draw a random sample of 500 sightings from each region (1000 rows total).
# The seed is fixed so the sample, and every figure derived from it, is the
# same on each render; slice_sample() replaces the superseded sample_n().
set.seed(2024)
sampled_data <- california_england_data %>%
  group_by(Region) %>%
  slice_sample(n = 500) %>%
  ungroup()
# Histogram of encounter length by region, zoomed to 0-5000 seconds.
# Bins are anchored at 0 (boundary = 0) so no bar straddles the axis edge, and
# the zoom is done with coord_cartesian() instead of scale limits. Scale limits
# discard observations above 5000 s before binning and drop the first and last
# bars, which is what produced the "Removed ... rows" warnings.
ggplot(sampled_data, aes(x = length_of_encounter_seconds, fill = Region)) +
  geom_histogram(
    binwidth = 200,
    boundary = 0,
    col = "white",
    show.legend = FALSE
  ) +
  facet_wrap(~ Region) +
  labs(title = "Length of Encounter Seconds vs. Region") +
  scale_x_continuous(breaks = seq(0, 5000, by = 500)) +
  coord_cartesian(xlim = c(0, 5000))
# Encounter lengths (seconds) of all California sightings, as a plain vector.
# Note: no duration cap is applied here, unlike california_england_data.
California_ufo_sightings <- ufo_sightings_transformed %>%
  filter(Region == "California") %>%
  filter(!is.na(length_of_encounter_seconds)) %>%
  pull(length_of_encounter_seconds)
# Encounter lengths (seconds) of all England sightings, as a plain vector.
# Note: no duration cap is applied here, unlike california_england_data.
england_ufo_sightings <- ufo_sightings_transformed %>%
  filter(Region == "England") %>%
  filter(!is.na(length_of_encounter_seconds)) %>%
  pull(length_of_encounter_seconds)
# Sample sizes: number of encounter lengths available for each region
n1 <- length(California_ufo_sightings)
n2 <- length(england_ufo_sightings)
\(n_1\) = 9374
\(n_2\) = 1885
# Point estimate: difference between the two sample means
xbar1 <- mean(California_ufo_sightings)
xbar2 <- mean(england_ufo_sightings)
stat <- xbar1 - xbar2

# Standard error of the difference (variances are not pooled)
s1 <- sd(California_ufo_sightings)
s2 <- sd(england_ufo_sightings)
se <- sqrt(s1^2 / n1 + s2^2 / n2)

# Test statistic relative to the null difference, using the conservative
# degrees of freedom min(n1, n2) - 1
null_value <- 0
df <- min(n1, n2) - 1
t_stat <- (stat - null_value) / se
\(\bar{x}_{cal}\) = 3478.5996
\(\bar{x}_{eng}\) = 66377.716
\(t_{stat}\) = -1.1732
# One-sided p-value: upper-tail probability of the t distribution with `df`
# degrees of freedom at t_stat (lower.tail = FALSE), matching H_A "greater"
p_val <- pt(t_stat, df = df, lower.tail = FALSE)
\(p\text{-value}\) = 0.8796
Decision: Fail to reject \(H_0\)
Conclusion: We do not have enough evidence that the average length of UFO encounters in California is greater than the average length of UFO encounters in England.
# Cross-check the manual calculation with the built-in Welch two-sample
# t-test (one-sided: California mean greater than England mean)
t.test(
  x = California_ufo_sightings,
  y = england_ufo_sightings,
  alternative = "greater",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: California_ufo_sightings and england_ufo_sightings
## t = -1.1732, df = 1886.3, p-value = 0.8796
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -151126.7 Inf
## sample estimates:
## mean of x mean of y
## 3478.60 66377.72